
/*
Get mean/sd, Nobs, Nppl by Early/Late sample period, separately for 
daughters, sons, mothers, fathers in regression samples.


Suggestion: do table like the one we had for decomposition, but unbalanced.
Income is for the IRP sample: sons=sons vs parental combined; daughters=daughters
vs parental combined; mothers= mother vs pooled kids; fathers=fathers
vs pooled kids. Log income is the same, but for the measure we use
for IGE (i.e. ``IGE sample''). Age at income and birth year are from the
IRP sample. Education and employment are from the decomposition sample.
Nobs and Nindiv from IRP sample. We can mention in text/table note/appendix
the fractions that remain from the IRP sample in the IGE and decomposition
samples.

Statistics are estimated over observations, not unique individuals.
*/

* Start from an empty session and close any log left open by an earlier run
clear
capture log close

* Project directories; the working directory is set to the project root
global data_dir "P:\2018\186/use"
global save_dir "P:\2018\186/Martin/Decompositions_SE_extLLSMM"
cd "P:\2018\186"

* First and last calendar year of each of the three sample periods
local yrmin1 1985
local yrmax1 1995

local yrmin2 1996
local yrmax2 2007

local yrmin3 2008
local yrmax3 2019



* USE the ranked main sample DATA (children matched to mother or father).
* Only variables matching the listed name patterns, plus year and woman, are
* read, and only observations with year>=1985.
* NOTE: -use- takes the option -clear- (not -replace-, which is rejected as an
* unknown option), so -clear- is specified here.
use *idnr *earn* *yob *obsy *max* *edu* *emp* year woman if year>=1985 using "use\swepanel_zeros", clear

/*
set seed 12354586
gen r=runiform()		// Draw sample for speed
keep if r<0.2
*/
* Harmonise variable names, one grouped rename per family of variables.

* Earnings variables
rename (pmearn pfearn mearn fearn pearn earn) (pm_LAB pf_LAB m_LAB f_LAB pLAB LAB)

* Employment variables
rename (memp femp emp) (m_emplavg f_emplavg employ)

* Education variables
rename (medu fedu edu) (m_schmax f_schmax schmax)

* Identifiers and the gender indicator
rename (idnr midnr fidnr woman) (newid m_newid f_newid female)

* Age variables and normalization:
*   AGEC  = year - yob, AGEC1 = AGEC - 40
*   f_LABAGE_1 = fobsy - fyob, m_LABAGE_1 = mobsy - myob
generate AGEC = year - yob
generate AGEC1 = AGEC - 40
foreach par in f m {
	generate `par'_LABAGE_1 = `par'obsy - `par'yob
}
* (disabled) egen fmage1=rowmean(agef1 agem1)

* Second to fourth powers of AGEC1 and of each parental age variable
forvalues pw = 2/4 {
	generate AGEC`pw' = AGEC1^`pw'
	foreach par in f m {
		generate `par'_LABAGE_`pw' = `par'_LABAGE_1^`pw'
	}
	* (disabled) g fmage`pw'=fmage1^`pw'
}

* Keep only years between the start of period 1 and the end of period 3
keep if year >= `yrmin1' & year <= `yrmax3'

* Period indicator: k = 1, 2, 3 for years within [yrmin_k, yrmax_k]
generate Period = .
forvalues k = 1/3 {
	replace Period = `k' if inrange(year, `yrmin`k'', `yrmax`k'')
}
label define periodvals 1 "`yrmin1'-`yrmax1'" 2 "`yrmin2'-`yrmax2'" 3 "`yrmin3'-`yrmax3'", replace
label values Period periodvals
// only going to use periods 1, 3 for table, so could drop 2 here


*** Some values are set to missing below so that each descriptive statistic
*** is computed on the intended subsample.

// NOTE: Starting point is IRP sample.

// IGE sample: bring in earn_c, mearn_c and fearn_c from the 100usd panel.
// The id is temporarily called idnr again because that is the merge key used.
rename newid idnr
merge 1:1 idnr year using "P:\2018\186\use\swepanel_100usd.dta", keepusing(earn_c mearn_c fearn_c) keep(master match) nogenerate
rename idnr newid

// Natural logs of the merged earnings measures (child, mother, father)
generate LOGLABc = ln(earn_c)
generate m_LOGLABc = ln(mearn_c)
generate f_LOGLABc = ln(fearn_c)

// Child log income is kept only if at least one parent's log income exists;
// a parent's log income is kept only if the child's log income exists.
replace LOGLABc = . if m_LOGLABc == . & f_LOGLABc == .
foreach par in m f {
	replace `par'_LOGLABc = . if LOGLABc == .
}
summarize LOGLABc m_LOGLABc f_LOGLABc

// Decomp samples, using children in mother OR father sample.
// A parent-specific flag equals 1 when that parent's p*_LAB, age, employment
// and education variables are all non-missing (evaluated before any blanking).
foreach par in m f {
	generate `par'_decomp = 1 if p`par'_LAB != . & `par'_LABAGE_1 != . & `par'_emplavg != . & `par'_schmax != .
}

// Child education/employment: blank when in neither parent's decomp sample
foreach v in schmax employ {
	replace `v' = . if m_decomp != 1 & f_decomp != 1
}

// Parent education/employment: blank when outside that parent's decomp sample
foreach par in m f {
	replace `par'_schmax = . if `par'_decomp != 1
	replace `par'_emplavg = . if `par'_decomp != 1
}
summarize schmax employ m_schmax m_emplavg f_schmax f_emplavg
	 

* Birth-year variables are referred to as cohort from here on
rename (yob myob fyob) (cohort m_cohort f_cohort)

*Children: mean, sd, number of observations and number of individuals,
*by Period and female. Works on a preserved copy of the data.
preserve
	generate AgeAtIncome = year - cohort
	rename (LAB LOGLABc schmax employ cohort) (Income LogIncome Education Employment BirthYear)
	egen flag1 = tag(newid Period)	// flag one unique obs per child within Period
	collapse ///
		(mean) MeanIncome=Income MeanLogIncome=LogIncome MeanAgeIncome=AgeAtIncome ///
		       MeanEducation=Education MeanEmployment=Employment MeanBirthYear=BirthYear ///
		(sd) SDIncome=Income SDLogIncome=LogIncome SDAgeIncome=AgeAtIncome ///
		     SDEducation=Education SDEmployment=Employment SDBirthYear=BirthYear ///
		(count) Nobs=Income ///
		(sum) Nindiv=flag1 ///
		, by(Period female)
	* person: 1 if female==1, 2 if female==0, missing otherwise
	generate person = cond(female == 1, 1, cond(female == 0, 2, .))
	tempfile child_descr
	save `child_descr'
restore


*Parents - Mothers (person=3), then fathers (person=4).
*For each parent type: mean, sd, number of observations and number of
*individuals by Period, saved to tempfile m_descr / f_descr.
local x=2
foreach p in m f {
	local x=`x'+1
	preserve
		// do not want to average over parent characteristics for parents not included in regression sample
		keep if pLAB!=. & `p'_LAB!=.
		rename `p'_LAB Income
		rename `p'_LOGLABc LogIncome
		rename `p'_LABAGE_1 AgeAtIncome
		rename `p'_schmax Education
		rename `p'_emplavg Employment
		rename `p'_cohort BirthYear
		egen flag1`p'=tag(`p'_newid Period) // flag one unique obs per parent within Period
		// Refer to the tag variable by its full name: the bare name flag1 only
		// resolved through variable abbreviation and fails with varabbrev off.
		collapse ///
			(mean) MeanIncome=Income MeanLogIncome=LogIncome MeanAgeIncome=AgeAtIncome ///
			       MeanEducation=Education MeanEmployment=Employment MeanBirthYear=BirthYear ///
			(sd) SDIncome=Income SDLogIncome=LogIncome SDAgeIncome=AgeAtIncome ///
			     SDEducation=Education SDEmployment=Employment SDBirthYear=BirthYear ///
			(count) Nobs=Income ///
			(sum) Nindiv=flag1`p' ///
			, by(Period)
		gen person=`x'
		tempfile `p'_descr
		save ``p'_descr'
	restore
}


* FULL sample period statistics: every year from the start of period 1 to the
* end of period 3 is recoded to Period 9.

replace Period = 9 if inrange(year, `yrmin1', `yrmax3')

*Children: same statistics as by period, now with one tag per child overall
preserve
	generate AgeAtIncome = year - cohort
	rename (LAB LOGLABc schmax employ cohort) (Income LogIncome Education Employment BirthYear)
	egen flag1 = tag(newid)	// flag one unique obs per child
	collapse ///
		(mean) MeanIncome=Income MeanLogIncome=LogIncome MeanAgeIncome=AgeAtIncome ///
		       MeanEducation=Education MeanEmployment=Employment MeanBirthYear=BirthYear ///
		(sd) SDIncome=Income SDLogIncome=LogIncome SDAgeIncome=AgeAtIncome ///
		     SDEducation=Education SDEmployment=Employment SDBirthYear=BirthYear ///
		(count) Nobs=Income ///
		(sum) Nindiv=flag1 ///
		, by(Period female)
	* person: 1 if female==1, 2 if female==0, missing otherwise
	generate person = cond(female == 1, 1, cond(female == 0, 2, .))
	tempfile child_descr9
	save `child_descr9'
restore


*Parents - Mothers (person=3), then fathers (person=4), full sample period.
*Same statistics as the by-period version, with one tag per parent overall;
*results are saved to tempfile m_descr9 / f_descr9.
local x=2
foreach p in m f {
	local x=`x'+1
	preserve
		// do not want to average over parent characteristics for parents not included in regression sample
		keep if pLAB!=. & `p'_LAB!=.
		rename `p'_LAB Income
		rename `p'_LOGLABc LogIncome
		rename `p'_LABAGE_1 AgeAtIncome
		rename `p'_schmax Education
		rename `p'_emplavg Employment
		rename `p'_cohort BirthYear
		egen flag1`p'=tag(`p'_newid) // flag one unique obs per parent
		// Refer to the tag variable by its full name: the bare name flag1 only
		// resolved through variable abbreviation and fails with varabbrev off.
		collapse ///
			(mean) MeanIncome=Income MeanLogIncome=LogIncome MeanAgeIncome=AgeAtIncome ///
			       MeanEducation=Education MeanEmployment=Employment MeanBirthYear=BirthYear ///
			(sd) SDIncome=Income SDLogIncome=LogIncome SDAgeIncome=AgeAtIncome ///
			     SDEducation=Education SDEmployment=Employment SDBirthYear=BirthYear ///
			(count) Nobs=Income ///
			(sum) Nindiv=flag1`p' ///
			, by(Period)
		gen person=`x'
		tempfile `p'_descr9
		save ``p'_descr9'
	restore
}




*Combine: start from the by-period child results, then stack the remaining
*result files in a fixed order (parents by period, then full-period files)
use `child_descr', clear
foreach part in m_descr f_descr child_descr9 m_descr9 f_descr9 {
	append using ``part''
}



*Reshape long to get one row per person, Period and characteristic,
*with columns Mean and SD.
*The two counts get the SD prefix so that they become extra rows whose value
*sits in the SD column, below the SDs of the characteristics in the table.
rename Nobs SDNobs
rename Nindiv SDNindiv
drop female
reshape long Mean SD, i(person Period) j(Variable) string
order Mean SD, last

*Reshape wide to get one Mean/SD column pair per Period
reshape wide Mean SD, i(person Variable) j(Period)

*"People" 
label define personvals 1 "Daughters" 2 "Sons" 3 "Mothers" 4 "Fathers"
label values person personvals

*Label variables: columns 1-3 are the three sample periods
forvalues k = 1/3 {
	label variable Mean`k' "Mean for `yrmin`k''-`yrmax`k''"
	label variable SD`k' "SD for `yrmin`k''-`yrmax`k''"
}
*Column 9 is the full sample period
label variable Mean9 "Mean for `yrmin1'-`yrmax3'"
label variable SD9 "SD for `yrmin1'-`yrmax3'"


*SAVE the table of descriptives in the save directory, overwriting any earlier copy
save "${save_dir}/descr_byperiod_SE.dta", replace




//clear
